1 Introduction

This R Markdown file explores data about internet-connected devices retrieved via the Shodan API.

1.1 Dependencies

  • R version: R version 4.4.2 (2024-10-31)
  • Required Libraries: dplyr, ggplot2, tidyr, httr, jsonlite, stringr, ggthemes, renv, plotly, htmltools, maps, knitr, kableExtra
#Load required libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(httr)
library(jsonlite)
library(stringr)
library(ggthemes)
library(renv)
library(plotly)
library(htmltools)
library(maps)
library(knitr)
library(kableExtra)

2 Input

2.1 Query Data from Shodan API

# Shodan search endpoint and credentials.
# The API key is read from the SHODAN_API_KEY environment variable, so it
# never has to be written into this file.
api_url <- "https://api.shodan.io/shodan/host/search"
api_key <- Sys.getenv("SHODAN_API_KEY")

# Query-string parameters sent with every request
params <- list(key = api_key)
params$query <- "has_screenshot:true encrypted" # ransomware related query

3 Retrieve Data from Shodan API

# Download every result page for the query and combine them into
# `shodan_df_all`.
#
# Reads:  `api_url`, `params` (defined above).
# Writes: `shodan_df_all` (all matches), plus `page`, `response`,
#         `shodan_data`, `shodan_df` and `max_pages` from the last iteration.
# Stops with the response body as the error message on any non-200 reply.

# Fail early with a clear message instead of sending an unauthenticated request
if (!isTRUE(nzchar(params$key))) {
  stop("SHODAN_API_KEY is not set; cannot query the Shodan API.",
       call. = FALSE)
}

page <- 1  # Start with the first page
shodan_pages <- list()  # One data frame per retrieved page, bound once below

# Begin the loop to retrieve the data
repeat {
  params$page <- page  # Set the page number

  # Print the page number
  cat("Retrieving page", page, "of the Shodan dataset...\n")
  flush.console()  # Ensures immediate output

  # Send the GET request
  response <- GET(api_url, query = params)

  # Return the error message if the status code is not 200
  if (status_code(response) != 200) {
    stop(content(response, "text", encoding = "UTF-8"), call. = FALSE)
  }

  # Parse the JSON response
  shodan_data <- fromJSON(content(response, "text", encoding = "UTF-8"))

  # The divisor assumes 100 results per page (TODO confirm against the
  # Shodan API documentation)
  max_pages <- ceiling(shodan_data$total / 100)

  # Keep this page's matches; pages are combined after the loop so the
  # accumulated data is not re-copied on every iteration
  shodan_df <- as.data.frame(shodan_data$matches)
  shodan_pages[[page]] <- shodan_df

  # Stop after retrieving the maximum number of pages
  if (page >= max_pages) {
    message("Reached the maximum number of pages. Stopping.")
    break
  }

  Sys.sleep(1)  # Pause between requests (not after the last) to avoid rate limiting
  page <- page + 1  # Increment the page number
}

# Combine all retrieved pages into a single data frame
shodan_df_all <- bind_rows(shodan_pages)
## Retrieving page 1 of the Shodan dataset...
## Retrieving page 2 of the Shodan dataset...
## Reached the maximum number of pages. Stopping.

4 Tidy Data

# Build `shodan_df_ransomware`: one row per host that has ransom-letter text,
# with display-friendly column names, sorted by country.

# Keep the fields of interest, then expand the nested `screenshot` and
# `location` columns into regular columns
shodan_df_ransomware <- shodan_df_all %>%
  select(ip_str, port, transport, product, os, location, screenshot) %>%
  unnest(screenshot) %>%
  unnest(location)

# Show Column names
colnames(shodan_df_ransomware)

# Select and rename in one step so each display name is tied to its source
# column by name rather than by position
shodan_df_ransomware <- shodan_df_ransomware %>%
  select(
    `IP Address` = ip_str,
    `Port` = port,
    `Transport` = transport,
    `Service` = product,
    `Operating System` = os,
    `Country` = country_name,
    `Country Code` = country_code,
    `City` = city,
    `Longitude` = longitude,
    `Latitude` = latitude,
    `Ransom Letter` = text
  ) %>%
  filter(!is.na(`Ransom Letter`)) %>% # Only keep rows with ransom letters
  # No group_by() here: arrange() ignores groups, and later steps set
  # their own grouping, so the result is left ungrouped
  arrange(Country)

5 Data Analysis of Ransomware Infections

# Summarise ransomware infections per country and print a short report.
#
# Reads:  `shodan_df_ransomware`.
# Writes: `common_country_count` (sorted frequency table by country),
#         `shodan_count` (infections per country/city/coordinate, column `n`),
#         `common_country_names`, `most_common_country`.

# Frequency table of infections per country, sorted in descending order
common_country_count <- sort(
  table(shodan_df_ransomware$Country),
  decreasing = TRUE
)

# Count infections per location; ungroup() first so the result does not
# inherit any grouping from the input
shodan_count <- shodan_df_ransomware %>%
  ungroup() %>%
  count(`Country Code`, `City`, `Longitude`, `Latitude`, Country)

# Get the names of the counts
common_country_names <- names(common_country_count)

if (length(common_country_count) == 0) {
  # Nothing to report: avoid max() on an empty table (-Inf plus a warning)
  most_common_country <- ""
  cat("The Shodan dataset contains no ransomware infections to summarise.\n")
} else {
  # Most common country; ties are collapsed into a single string
  most_common_country <- paste(
    common_country_names[common_country_count == max(common_country_count)],
    collapse = ", "
  )

  # "United States" reads better with a definite article; every other
  # country name is used as-is
  report_intro <- if (most_common_country == "United States") {
    "According to the Shodan dataset, the"
  } else {
    "According to the Shodan dataset,"
  }

  cat(report_intro, most_common_country,
      "is the country with the highest number of ransomware infections,", "with",
      max(common_country_count), "incidents.",

      # Display the total number of ransomware infections
      "There are a total of", nrow(shodan_df_ransomware),
      "ransomware infections worldwide!", "\n",
      "\n",

      # Statistical Analysis
      "The average number of ransomware infections per country is",
      round(mean(common_country_count), 2), "\n",  # Average

      "The median number of ransomware infections per country is",
      median(common_country_count), "\n", # Median

      "The standard deviation of ransomware infections per country is",
      round(sd(common_country_count), 2), "\n") # Standard Deviation
}
## According to the Shodan dataset, the United States is the country with the highest number of ransomware infections, with 12 incidents. There are a total of 114 ransomware infections worldwide! 
##  
##  The average number of ransomware infections per country is 3 
##  The median number of ransomware infections per country is 1 
##  The standard deviation of ransomware infections per country is 3.16

5.1 Table of Ransomware Infections by Country

# Render the per-country frequency table as a styled HTML table inside a
# fixed-height scroll box
scroll_box(
  kable_styling(
    kable(
      common_country_count,
      format = "html",
      col.names = c("Country", "Number of Infections"),
      caption = "Ransomware Infections by Country"
    ),
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ),
  height = "500px",
  width = "100%"
)
Ransomware Infections by Country
Country Number of Infections
United States 12
Germany 11
Brazil 10
Mexico 9
China 8
Russian Federation 7
Spain 5
Turkey 5
Argentina 4
Czechia 4
India 4
Colombia 3
Pakistan 3
Chile 2
Kazakhstan 2
Singapore 2
Ukraine 2
Bahrain 1
Bangladesh 1
Belarus 1
Bulgaria 1
Canada 1
Denmark 1
Egypt 1
Finland 1
France 1
Ghana 1
Japan 1
Lithuania 1
Nigeria 1
Panama 1
Peru 1
Portugal 1
Serbia 1
South Africa 1
Taiwan 1
Uzbekistan 1
Viet Nam 1

6 Data Visualization of Ransomware Infections

6.1 Ransomware Infections by Country and City (ggplot2)

# World map of ransomware infections: one point per row of `shodan_count`,
# coloured by city and sized by the number of infections (`n`)
ggplot(shodan_count, aes(x = Longitude, y = Latitude, color = `City`,
                         size = n)) +
  borders("world", colour = "gray50", fill = "gray50") +
  # Optional crop to remove Antarctica (currently disabled):
  # coord_quickmap(xlim = c(-180, 180), ylim = c(-60, 90)) +
  geom_point() +
  labs(title = "Ransomware Infections by Country and City",
       caption = "Source: Shodan API",
       x = "Longitude",
       y = "Latitude",
       color = "City") + # Colour is mapped to City, so label it as such
  # theme_fivethirtyeight() is a complete theme, so it is the only base
  # theme needed; element tweaks go in the theme() call below
  theme_fivethirtyeight() +
  # Remove the gridlines and axis labels
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "none", # Hides the colour and size legends
        plot.title = element_text(hjust = 0.5)) # Center the title

6.2 Interactive Ransomware Infections Map (plotly)

# Interactive version of the infections map: build the ggplot object `p`
# (one point per row of `shodan_count`, coloured by city, sized by `n`),
# then convert it with ggplotly()
p <- ggplot(shodan_count, aes(x = Longitude, y = Latitude, color = `City`,
                              size = n)) +
  borders("world", colour = "gray50", fill = "gray50") +
  geom_point() +
  labs(title = "Ransomware Infections by Country and City",
       caption = "Source: Shodan API",
       x = "Longitude",
       y = "Latitude",
       color = "City") + # Colour is mapped to City, so label it as such
  # theme_fivethirtyeight() is a complete theme, so it is the only base
  # theme needed; element tweaks go in the theme() call below
  theme_fivethirtyeight() +
  # Remove the gridlines and axis labels
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "none", # Hides the colour and size legends
        plot.title = element_text(hjust = 0.5)) # Center the title

ggplotly(p)